import jieba
import jieba.posseg as pseg
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
import re
import nltk
from textblob import TextBlob
from typing import List
import warnings
# Install a blanket "ignore" filter so no warnings are printed for the rest of the run.
warnings.filterwarnings('ignore')

class ContextSemanticAnalyzer:
    """Compare the text that precedes and follows a target word.

    The text is segmented with ``jieba``; for every occurrence of the target
    word a window of tokens is taken on each side. All "before" windows and
    all "after" windows are then compared with TF-IDF cosine similarity.
    """

    def __init__(self, window_size=5):
        """Create an analyzer.

        Args:
            window_size: Default number of tokens taken on each side of an
                occurrence when a method is not given an explicit window.
        """
        self.window_size = window_size
        # NOTE(review): with its default token_pattern, TfidfVectorizer keeps
        # only tokens of two or more word characters, so single-character
        # words produced by jieba never become features — confirm intended.
        self.vectorizer = TfidfVectorizer(max_features=1000, stop_words=None)

    def extract_contexts(self, text: str, target_word: str, window_size=None):
        """Collect the token windows before and after each target-word hit.

        A token counts as a hit when it contains ``target_word`` as a
        substring.

        Args:
            text: Raw text to segment with ``jieba.cut``.
            target_word: Word to look for.
            window_size: Tokens to take on each side of a hit; ``None`` means
                use ``self.window_size``.

        Returns:
            ``(before_contexts, after_contexts)`` — two equal-length lists of
            space-joined token strings, one entry per hit. Both lists are
            empty when there is no hit or when ``text`` / ``target_word`` is
            empty.
        """
        if window_size is None:
            window_size = self.window_size

        # An empty target is a substring of every token and would turn every
        # position into a hit; treat it (and empty text) as "not found".
        if not text or not target_word:
            return [], []

        words = list(jieba.cut(text))

        # Positions of every token that contains the target word.
        target_positions = [
            i for i, word in enumerate(words) if target_word in word
        ]
        if not target_positions:
            return [], []

        before_contexts = []
        after_contexts = []
        for pos in target_positions:
            # Window before the hit, clamped at the start of the text.
            start_before = max(0, pos - window_size)
            before_contexts.append(' '.join(words[start_before:pos]))

            # Window after the hit (hit itself excluded), clamped at the end.
            end_after = min(len(words), pos + window_size + 1)
            after_contexts.append(' '.join(words[pos + 1:end_after]))

        return before_contexts, after_contexts

    def analyze_semantic_difference(self, before_contexts: List[str],
                                    after_contexts: List[str]):
        """Measure how different the "before" and "after" contexts are.

        All "before" windows are merged into one document and all "after"
        windows into another; the two documents are vectorized with
        ``self.vectorizer`` (refitted on every call) and compared.

        Args:
            before_contexts: Space-joined token windows preceding each hit.
            after_contexts: Space-joined token windows following each hit.

        Returns:
            ``None`` when either side is empty/blank or vectorization fails.
            Otherwise a dict with keys ``'similarity'`` (cosine similarity of
            the two documents), ``'differential_words'`` (up to ten features
            with the largest non-zero absolute score gap, largest first, each
            a dict with ``'word'``, ``'before_score'``, ``'after_score'``,
            ``'difference'`` and ``'tendency'``), ``'before_text'`` and
            ``'after_text'``.
        """
        if not before_contexts or not after_contexts:
            return None

        all_before = ' '.join(before_contexts)
        all_after = ' '.join(after_contexts)

        if not all_before.strip() or not all_after.strip():
            return None

        texts = [all_before, all_after]
        try:
            tfidf_matrix = self.vectorizer.fit_transform(texts)

            similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

            feature_names = self.vectorizer.get_feature_names_out()
            before_scores = tfidf_matrix[0].toarray()[0]
            after_scores = tfidf_matrix[1].toarray()[0]

            # Ten features with the largest score gap, largest gap first.
            diff_scores = np.abs(before_scores - after_scores)
            top_diff_indices = np.argsort(diff_scores)[-10:][::-1]

            differential_words = []
            for idx in top_diff_indices:
                if diff_scores[idx] > 0:
                    before_score = before_scores[idx]
                    after_score = after_scores[idx]
                    differential_words.append({
                        'word': feature_names[idx],
                        'before_score': before_score,
                        'after_score': after_score,
                        'difference': diff_scores[idx],
                        'tendency': 'before' if before_score > after_score else 'after'
                    })

            return {
                'similarity': similarity,
                'differential_words': differential_words,
                'before_text': all_before,
                'after_text': all_after
            }
        except Exception as e:
            # Deliberate best-effort: a failed vectorization is reported and
            # turned into "no result" instead of aborting the caller's loop.
            print(f"分析过程中出现错误: {e}")
            return None

    def comprehensive_analysis(self, text: str, target_word: str, window_size=None):
        """Run context extraction and semantic comparison for one word.

        Args:
            text: Raw text to analyze.
            target_word: Word whose surrounding contexts are compared.
            window_size: Tokens on each side of a hit; ``None`` (the default)
                uses the analyzer's configured ``self.window_size``.

        Returns:
            ``(results, occurrences)``. When the word is not found,
            ``results['semantic']`` is ``0.0`` and ``occurrences`` is ``0``.
            Otherwise ``results['semantic']`` is whatever
            ``analyze_semantic_difference`` returned (a dict or ``None``) and
            ``occurrences`` is the number of hits.
        """
        before_contexts, after_contexts = self.extract_contexts(text, target_word, window_size)
        results = {}
        if not before_contexts or not after_contexts:
            print(f"未找到词汇 '{target_word}' 或上下文不足")
            results['semantic'] = 0.000
            return results, 0

        results['semantic'] = self.analyze_semantic_difference(before_contexts, after_contexts)

        return results, len(before_contexts)

def read_txt_lines(stopwordss: str, filename: str) -> List[str]:
    """Return the lines of ``filename`` that also appear in ``stopwordss``.

    Both files are read as UTF-8; every line is stripped and blank lines are
    skipped. Despite its name, ``stopwordss`` works as an allow-list: a line
    of ``filename`` is kept only when it exactly equals one of its lines.

    Args:
        stopwordss: Path of the file listing the words to keep.
        filename: Path of the file whose lines are filtered.

    Returns:
        The matching lines in the order they occur in ``filename``, one entry
        per occurrence there. An empty list when either file cannot be read.
    """
    try:
        with open(stopwordss, 'r', encoding='utf-8') as stops:
            # A set gives O(1) membership tests and keeps a word that is
            # listed twice in the allow-list from being emitted twice.
            allowed = {line.strip() for line in stops if line.strip()}
        with open(filename, 'r', encoding='utf-8') as file:
            candidates = [line.strip() for line in file if line.strip()]
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file; ValueError: undecodable bytes
        # (UnicodeDecodeError) or an invalid path.
        print(f"读取文件出错: {e}")
        return []
    return [item for item in candidates if item in allowed]

# Usage example
def main():
    """Score every selected word of ``unique_words.txt`` against ``reasoning.txt``.

    Reads the text from ``reasoning.txt``, keeps the words of
    ``unique_words.txt`` that are also listed in ``select.txt``, runs the
    before/after context comparison for each with a 200-token window, and
    writes ``word: similarity, occurrences`` lines to ``similarity.txt``
    sorted by ascending similarity. The output file is written (possibly
    empty) even when ``unique_words.txt`` does not exist.
    """
    analyzer = ContextSemanticAnalyzer(window_size=5)

    # Read the whole text up front so the handle is not held open while
    # the analysis runs.
    with open('reasoning.txt', 'r', encoding='utf-8') as original_file:
        sample_text = original_file.read()

    filename = "unique_words.txt"
    stopwordss = "select.txt"
    data = {}

    if os.path.exists(filename):
        lines = read_txt_lines(stopwordss, filename)
        print(f"从 {filename} 读取了 {len(lines)} 行数据")

        # Analyze every selected word.
        for target_word in lines:
            results, times = analyzer.comprehensive_analysis(sample_text, target_word, window_size=200)
            if times <= 0:
                continue
            semantic = results.get('semantic') if results else None
            # Always store a (score, times) pair: a bare 0 here would break
            # both the sort key and the tuple unpacking below.
            score = semantic['similarity'] if semantic else 0.0
            data[target_word] = (score, times)
    else:
        print(f"文件 {filename} 不存在，请检查文件路径")

    sorted_data = sorted(data.items(), key=lambda x: x[1][0])
    with open('similarity.txt', 'w', encoding='utf-8') as f:
        for name, (score, times) in sorted_data:
            f.write(f"{name}: {score}, {times}\n")

# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()